/*
	Purpose: Using the sample of 1940 Census fathers aged 30-50
	         (from 0b), this file creates father income scores
	         at the occupation x race x south x edu level. Missing
	         income is imputed.

	Creates: avgincomes_fathers1940_byrace_bysouth_byedu.dta
*/
clear
set more off
cd "$Mydirectory1/1_DataSources/CensusData/"

	* Load the 1940 Census sample of fathers aged 30-50 used for the income scores
	use ./input/Census1940_fathers_ages30to50_forIncomeScores.dta, clear 

	* Drop observations with missing education or with occupation code 99
	drop if edu==. | occ1950ej==99

	* One per observation; summed later to count observations in each cell
	generate number = 1
	
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*
	
*******************
*** TEMPLATES
*******************

* The templates enumerate every occupation x race x south x edu cell, including
* cells with no observations in the sample, so that empty cells can be imputed
* after the cell means are merged in.
*
* Each of templates 1-3 keeps one row per level of its by-variable, duplicates
* that row, and assigns the two race codes explicitly: the first copy gets the
* lowest race code observed in the full sample, the second copy gets 2.
* Taking the first code from the within-group minimum instead would create two
* identical race==2 rows for any group that contains no observation with the
* lower race code, and the later 1:1 merge on the template would then fail.
* NOTE(review): this assumes race takes exactly two values (the lower code and 2) - confirm.

	quietly summarize race
	local race_first = r(min)

* Template 1: occupation x race 
preserve

	collapse (min) race, by(occ1950ej)
	
	expand 2
	bysort occ1950ej: replace race = cond(_n==1, `race_first', 2)

	tempfile occbyrace
	save `occbyrace'

restore 
	
* Template 2: race x south
preserve 
	
	collapse (min) race, by(south_merge)
	expand 2
	bysort south_merge: replace race = cond(_n==1, `race_first', 2)
	
	tempfile south
	save `south'
	
restore

* Template 3: education x race 
preserve

	collapse (min) race, by(edu)	
	expand 2
	bysort edu: replace race = cond(_n==1, `race_first', 2)

	tempfile edubyrace
	save `edubyrace'

restore 
	
* Template 4: occupation x race x south x edu
* Joining on race forms all pairwise combinations within each race, so the
* result is the full cross of occupation, south and edu for each race code.
preserve

	use `occbyrace', clear
	joinby race using `south'
	joinby race using `edubyrace'

	* Fail here, with a clear message, if any cell appears more than once
	isid occ1950ej race south_merge edu
	
	tempfile template
	save `template'
	
restore
	
********************************************
*** COLLAPSE (OCC X RACE X SOUTH X EDU)
********************************************

* Occupation x race x south means of each income measure, kept for the imputation step
	local income_vars incwage fam_income hh_income
	foreach v of local income_vars {
		bysort occ1950ej race south_merge: egen `v'_ors = mean(`v')
	}

* Reduce to one row per occupation x race x south x edu cell:
* number of observations in the cell and the mean of every income variable
	collapse (rawsum) number  (mean) incwage* fam_income* hh_income*, by(occ1950ej race south_merge edu) 

	tempfile income
	save `income'

* Attach the cell means to the full template of cells
	use `template'
	merge 1:1 occ1950ej race south_merge edu using `income', nogenerate

	* Cells absent from the collapsed data have no observations
	replace number = 0 if missing(number)

* Report cells without an income score, then flag them
	count if missing(incwage)
	tabulate occ1950ej race if missing(incwage)

	generate flag_1940_byors_byedu = missing(incwage)

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

********************************************
*** IMPUTATIONS
********************************************
	
* Impute income for cells that have no income at the occ x race x south x edu
* level (e.g. Black lawyers with edu==3)
	
	sort occ1950ej south_merge race edu 
	
	/* Step 1: copy the occ x race x south average income into the cells of
	   that group whose occ x race x south x edu income is missing. The
	   average was computed once per occ x race x south group, so max() is
	   used only to pick up the group's non-missing value. */
	foreach c in incwage fam_income hh_income {
		by occ1950ej south_merge race: egen test = max(`c'_ors)
		replace `c'_ors=test if `c'_ors==.
		drop test 
	}
	
	/* Step 2: scale the occ x race x south average by an education-specific
	   factor. The education levels are read from the data rather than
	   hard-coded, so every level present receives a scale factor. */
	quietly levelsof edu, local(edu_levels)

	foreach c in incwage fam_income hh_income {
		gen temp = `c' / `c'_ors //ratio of occ x race x south x edu to occ x race x south

		gen scale_factor =.
		foreach i of local edu_levels {
			sum temp if edu==`i' [aw=number] //average ratio across cells within same edu, weighted by number of people in the cell
			if r(N)==0 {
				* Without this check r(mean) is undefined and the replace below fails with a syntax error
				display as error "no non-missing `c' ratio for edu==`i'; cannot compute a scale factor"
				error 2000
			}
			replace scale_factor=`r(mean)' if edu==`i' //scale factor is the same for everyone within same edu. (regardless of race or location)
		}
	
		replace `c' = `c'_ors * scale_factor if `c'==. //scale income 
		assert `c'!=. 
		
		drop temp scale_factor `c'_ors
	}

* Confirm no missing values remain
	assert incwage!=.
	
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

**********
* Save
**********
	
	* Income scores: give each its final name, then attach its label
	rename incwage avgincwage_1940_byr_south_edu
	label variable avgincwage_1940_byr_south_edu "Coarse avg. personal income score (1940 Census), by race, south, and edu"

	rename fam_income avg_faminc_1940_byr_south_edu
	label variable avg_faminc_1940_byr_south_edu "Coarse avg. family income score (1940 Census), by race, south, and edu"

	rename hh_income avg_HHinc_1940_byr_south_edu
	label variable avg_HHinc_1940_byr_south_edu "Coarse avg. household income score (1940 Census), by race, south, and edu"

	* Cell size
	rename number number_1940obs_byr_south_edu
	label variable number_1940obs_byr_south_edu "Number of 1940 obs in occ x race x south x edu cell"

	* Imputation flag and cell identifiers
	label variable flag_1940_byors_byedu "Flag for imputed value"
	label variable race "Respondent race"
	rename occ1950ej fatheroccej

	* Write the file sorted by its cell identifiers
	sort fatheroccej race south_merge edu
	save ./output/avgincomes_fathers1940_byrace_bysouth_byedu.dta, replace
	